# 预训练数据集构建
import pandas as pd


# 文本序列分段
# Split a text sequence into segments
def split_text(text, max_length):
    """Split *text* into chunks of at most *max_length* whitespace-separated words.

    Args:
        text: Input string; split on arbitrary whitespace via ``str.split()``.
        max_length: Maximum number of words per chunk (assumed > 0 —
            ``range`` would raise ``ValueError`` on a step of 0).

    Returns:
        A list of strings. If the text has ``max_length`` words or fewer,
        the original string is returned unchanged as a single-element list
        (its internal whitespace is preserved); otherwise each chunk is
        re-joined with single spaces.
    """
    words = text.split()
    # Short input: return verbatim, preserving original whitespace.
    if len(words) <= max_length:
        return [text]

    # Long input: emit fixed-size word windows, re-joined with single spaces.
    return [
        ' '.join(words[i:i + max_length])
        for i in range(0, len(words), max_length)
    ]


# Maximum number of words per segment.
max_length = 4096


def main():
    """Read the raw training TSV, chunk each text, and write the result.

    Reads ``train_set.csv`` (tab-separated), splits every entry of its
    ``text`` column into word chunks of at most ``max_length`` words via
    ``split_text``, and writes the flattened chunks to
    ``pretrained_data.csv`` (tab-separated, no index column).
    """
    file_path = 'train_set.csv'  # input path
    df = pd.read_csv(file_path, sep='\t')

    # Split every row of the 'text' column and flatten the chunks.
    # NOTE(review): assumes the column contains no NaN values — a missing
    # entry would raise inside split_text; confirm against the data file.
    processed_rows = [
        chunk
        for text in df['text']
        for chunk in split_text(text, max_length)
    ]

    result_df = pd.DataFrame(processed_rows, columns=['text'])

    # Save the processed file.
    file_output = 'pretrained_data.csv'  # output path
    result_df.to_csv(file_output, sep='\t', index=False)


# Guard the entry point so importing this module does not trigger file I/O.
if __name__ == '__main__':
    main()
